# ============================================================
# Home Data for ML Course (Kaggle Learn Users)
# CatBoost 最終調整版（RMSE学習 + depth探索）
#
# 方針：
#  - loss_function="RMSE"（評価指標と学習を一致させる）
#  - log1pで学習 → expm1で戻す（あなたの13291系と同じ思想）
#  - depth 7/8/9/10 を軽く試して「このデータに合う深さ」を選ぶ
#  - iterations多め + early stopping で最適地点に自動停止
#
# 出力：
#  - 最良depthで学習した submission.csv
# ============================================================

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool

# ----------------------------
# 1) Load the competition data
# ----------------------------
train = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
test = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

# ----------------------------
# 2) Outlier removal
# ----------------------------
# Rows with a very large living area (GrLivArea > 4000) but a low sale price
# (SalePrice < 300000) are dropped from the training data.
outlier_mask = (train["GrLivArea"] > 4000) & (train["SalePrice"] < 300000)
train = train.drop(train.loc[outlier_mask].index)

# Split the target from the features. Copies are taken so that later feature
# engineering never writes back into `train` / `test`.
# NOTE(review): only SalePrice is removed here, so an identifier column such as
# "Id" (read from `test` when the submission is written) would stay in X as a
# model feature - confirm that this is intended.
X = train.drop(columns=["SalePrice"]).copy()
y = train["SalePrice"].copy()
X_test = test.copy()

# ----------------------------
# 3) Feature engineering（軽量で効くものだけ）
# ----------------------------
def add_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a handful of derived columns appended.

    The input frame is never modified. Missing values in the area and
    bathroom-count columns that are present are replaced with 0 in the
    returned copy. ``TotalSF`` is always created (an absent component counts
    as 0); every other derived column is created only when all of its source
    columns exist.

    Derived columns, in the order they are appended:
    ``TotalSF``, ``HouseAge``, ``RemodAge``, ``TotalBath``,
    ``Qual_x_GrLivArea``, ``Qual_x_TotalSF``.

    Args:
        df: Frame of raw house attributes.

    Returns:
        A new frame holding the original columns plus the derived ones.
    """
    out = df.copy()

    def has(*names: str) -> bool:
        """Tell whether every named column exists in the working copy."""
        return all(name in out.columns for name in names)

    area_cols = ("TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "GarageArea")
    bath_cols = ("FullBath", "HalfBath", "BsmtFullBath", "BsmtHalfBath")

    # Zero-fill the raw inputs first so the sums below never propagate NaN.
    for name in area_cols + bath_cols:
        if has(name):
            out[name] = out[name].fillna(0)

    # Total floor area; a missing component column contributes a scalar 0.
    basement = out.get("TotalBsmtSF", 0)
    first_floor = out.get("1stFlrSF", 0)
    second_floor = out.get("2ndFlrSF", 0)
    out["TotalSF"] = basement + first_floor + second_floor

    # Years between construction / remodel and the sale.
    if has("YrSold", "YearBuilt"):
        out["HouseAge"] = out["YrSold"] - out["YearBuilt"]
    if has("YrSold", "YearRemodAdd"):
        out["RemodAge"] = out["YrSold"] - out["YearRemodAdd"]

    # Bathroom count where each half bath is weighted 0.5.
    if has(*bath_cols):
        out["TotalBath"] = (
            out["FullBath"]
            + 0.5 * out["HalfBath"]
            + out["BsmtFullBath"]
            + 0.5 * out["BsmtHalfBath"]
        )

    # Quality-by-size interaction terms.
    if has("OverallQual", "GrLivArea"):
        out["Qual_x_GrLivArea"] = out["OverallQual"] * out["GrLivArea"]
    if has("OverallQual"):
        out["Qual_x_TotalSF"] = out["OverallQual"] * out["TotalSF"]

    return out

# Apply the same feature engineering to the training and the test frame.
X, X_test = add_features(X), add_features(X_test)

# ----------------------------
# 4) Categorical handling and imputation
# ----------------------------
# Object-dtype columns of the training frame are treated as categorical;
# they are handed to CatBoost by positional index.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
cat_idx = [X.columns.get_loc(name) for name in cat_cols]

# Every remaining column is imputed with its median. The medians come from the
# training frame only and are reused for the test frame.
num_cols = X.columns.difference(cat_cols).tolist()
med = X[num_cols].median()

for frame in (X, X_test):
    # Missing categories become an explicit "Missing" level.
    frame[cat_cols] = frame[cat_cols].fillna("Missing")
    frame[num_cols] = frame[num_cols].fillna(med)

# ----------------------------
# 5) Target transform: log1p
# ----------------------------
# Models are fit on log1p(price); predictions are mapped back with expm1.
y_log = np.log1p(y)

# ----------------------------
# 6) Light search over tree depth
# ----------------------------
# For each candidate depth, run 5-fold CV on the log1p target, collect
# out-of-fold predictions, and average the per-fold test predictions in price
# space. The depth with the lowest price-space CV RMSE wins, and its averaged
# test prediction is kept for the submission.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

depth_candidates = [7, 8, 9, 10]  # widen (e.g. 6..10) for a broader search

best_depth = None
best_cv = float("inf")
best_test_pred = None

# Loop invariants: neither the test pool nor the price-space targets change
# between folds or depths, so build them once instead of once per fold.
test_pool = Pool(X_test, cat_features=cat_idx)
y_price = np.expm1(y_log)

for depth in depth_candidates:
    oof_log = np.zeros(len(X))
    test_pred = np.zeros(len(X_test))

    for tr_idx, va_idx in kf.split(X):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y_log.iloc[tr_idx], y_log.iloc[va_idx]

        train_pool = Pool(X_tr, y_tr, cat_features=cat_idx)
        valid_pool = Pool(X_va, y_va, cat_features=cat_idx)

        # RMSE on the log target. The iteration cap is deliberately high;
        # the overfitting detector stops training after 600 rounds without
        # improvement on the validation fold.
        model = CatBoostRegressor(
            loss_function="RMSE",
            eval_metric="RMSE",
            iterations=50000,
            learning_rate=0.02,
            depth=depth,
            l2_leaf_reg=3.0,
            random_seed=42,
            od_type="Iter",
            od_wait=600,
            verbose=0,
        )

        # NOTE: the validation fold is used both for early stopping and for
        # the out-of-fold score, so the CV estimate is mildly optimistic.
        model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

        oof_log[va_idx] = model.predict(valid_pool)

        # Convert each fold's test prediction back to prices, then average.
        test_pred += np.expm1(model.predict(test_pool)) / kf.n_splits

    # CV score in price space. np.sqrt(MSE) is used instead of
    # mean_squared_error(..., squared=False): the `squared` argument was
    # deprecated in scikit-learn 1.4 and removed in 1.6, where it raises
    # TypeError. This form works on every version.
    cv_rmse = float(np.sqrt(mean_squared_error(y_price, np.expm1(oof_log))))
    print(f"[depth={depth}] CV RMSE(price): {cv_rmse:.2f}")

    if cv_rmse < best_cv:
        best_cv = cv_rmse
        best_depth = depth
        best_test_pred = test_pred

print("\n✅ Best depth:", best_depth, " / CV RMSE(price):", f"{best_cv:.2f}")

# ----------------------------
# 7) Submission (test predictions from the best depth)
# ----------------------------
# Pair each test Id with its averaged price prediction and write the CSV
# without the index column.
submission = test[["Id"]].assign(SalePrice=best_test_pred)
submission.to_csv("submission.csv", index=False)
print("✅ saved: submission.csv")
